
##################
## Solar Siting estimation 2012-end of 2015 only ("later years")

require(data.table)
require(iterators)
require(foreach)
require(parallel)
require(doSNOW)


cl <- snow::makeCluster(rep('localhost',40), type='SOCK')
registerDoSNOW(cl)



WORK = file.path('Z:/user/ajk41/Solar siting/Scripts/JAERE Code Repository/Data JAERE')
WORK.OUT = file.path(WORK, 'intermediate_output')
dir.create(WORK.OUT)

#### Get data 
load(file.path(WORK.OUT, 'usingWorkspace.Rdata'))




df.useMYS = split(rbindlist(map(df.split.useMYS, function(x) {
  x$ORISPL = rownames(x)
  return(x)}), idcol = 'INTX'), by='ymn')

## For v7, cut off anything <2012 ------- # see also line 106, cutting df.common.intx as well.

# since the data in df.common.intx is already de-meaned at the ym level, just cut based on ym's:
keep.ym.char = expand.grid(yy = 2012:2016, mm = 1:12) %>%
  dplyr::mutate(ym = paste0(yy,'-',mm)) %>%
  pull(ym)
cuttime = T
  
if(cuttime==T){
df.split = lapply(df.split, function(y){
  y[ym %in% keep.ym.char,]
})   }




empty.ORISPL = unlist(lapply(df.split, function(i) NROW(i)==0))
empty.ORISPL.names = names(df.split[empty.ORISPL])

df.useMYS = lapply(df.useMYS, function(xx){
  list(df = df.split[xx$ORISPL],
              ym = xx$ym[1],
              ymn = xx$ymn[1])
})

#### jDecompose function ####
jDecompose <- function(x){
  if(class(x)[1]%in%c('data.frame','data.table')) x = x$ym
  x = as.character(x)
  y = strsplit(x = x, split="---")
  z = unlist(strsplit(x=y[[1]][2], split="--"))
  return(data.frame(intx = y[[1]][1],
                    ym = z, stringsAsFactors=F))
} # takes the long ym value, parses out the intx, year-mo for the ym


# Cool. Merge in National NSRDB SOL_ --------------------------------------
WEST.names = gsub('PCA_', 'SOL_', unique(grep('PCA_', names(df.common.intx[['WEST']]), value=T)))
EAST.names = gsub('PCA_', 'SOL_', unique(grep('PCA_', names(df.common.intx[['EAST']]), value=T)))
TRE.names  = gsub('PCA_', 'SOL_', unique(grep('PCA_', names(df.common.intx[['TRE']] ), value=T)))

map.names = rbind(data.frame(INTX = 'W',NAME = WEST.names),
                  data.frame(INTX = 'E',NAME = EAST.names),
                  data.frame(INTX = 'T',NAME = TRE.names))
map.names$pcac = sapply(strsplit( as.character(map.names$NAME),'_[[:alnum:]]{1}'), '[', 2)


NN = readRDS(file.path(WORK, 'NSRDB_siting----2018-10-02.rds'))  # NSRDB requires manually querying and manually downloading NSRDB files and is not coded in R.
##--> some pcac (planning control area) mappings are combined below. This combines the solar for each subregion.
NN[pcac=='MROE',pcac:='MROW']
NN[pcac=='RFCM',pcac:='RFCW']
NN[pcac=='NEWE',pcac:='NYUP']
NN[pcac=='SPSO',pcac:='SPNO']
NN = merge(NN, map.names, by='pcac', all=F)
NNN = dcast(NN, formula = dh_e ~ NAME, fun.aggregate = sum,  value.var = 'solar_index', drop=T)




df.common.intx[['WEST']] = merge(df.common.intx[['WEST']], NNN[,grep('dh_e|_W', names(NNN), value=F), with=F], by = 'dh_e')
df.common.intx[['EAST']] = merge(df.common.intx[['EAST']], NNN[,grep('dh_e|_E', names(NNN), value=F), with=F], by = 'dh_e')
df.common.intx[['TRE']]  = merge(df.common.intx[['TRE']] , NNN[,grep('dh_e|_T', names(NNN), value=F), with=F], by = 'dh_e')

## 
# New: combine down high-variance eGrid subregions (solar and load)
df.common.intx[['EAST']][,c('PCA_EMROW'):=list(PCA_EMROE+PCA_EMROW)]
df.common.intx[['EAST']][,c('PCA_ERFCW'):=list(PCA_ERFCM+PCA_ERFCW)]
df.common.intx[['EAST']][,c('PCA_ENYUP'):=list(PCA_ENEWE+PCA_ENYUP)]
df.common.intx[['EAST']][,c('PCA_ESPNO'):=list(PCA_ESPSO+PCA_ESPNO)]
df.common.intx[['EAST']][,c('PCA_EMROE','PCA_ERFCM',
                            'PCA_ENEWE','PCA_ESPSO'):=NULL]






df.common.intx.B = copy(df.common.intx)

RES = foreach(df.iter = iter(df.useMYS, by='cell'), .errorhandling = 'pass', .inorder = F,
              .packages = c('data.table'), 
              .noexport = c('qr','wc','wdfc','wdfc.mm.g','wdfc.mm','why','cf','coef.results','Xs'),
              .verbose=T) %dopar% {
  hasSOL = T  # update later with conditional 
  jd = jDecompose(df.iter[['ym']])
  if(cuttime==T) {
    jd = jd[which(jd$ym %in% keep.ym.char),]
    if(NROW(jd)<=2) return(warning(paste0(df.iter$ymn,' has 1 or 0 months of data remaining after cut')))}
  
# With-SOL ----------------------------------------------------------------
  if(T==T){
  hasSOL = T
  
  wdfc = df.common.intx.B[[jd$intx[1]]][ym%in%jd$ym,][,SOL_WSPSO:=NULL]
  wdfc[is.na(wdfc)] = 0
  wdfc.PCAs = grep(names(wdfc), pattern='PCA_|SOL_', value=T)  #--> new v4: add "SOL_"

  wdfc[,(wdfc.PCAs):=lapply(.SD, function(y){ y - mean(y, na.rm=T)}), by = .(ym, hour, weekday), .SDcols = wdfc.PCAs]
  wdfc[,icpt:=1]
  wdfc = wdfc[,c('dh_e','hour','mo','icpt',wdfc.PCAs), with=F]
  
  # Check for within-group variance; can't invert a matrix where each within-group isn't zero, right?
  # Catalog all of the mo-hour-PCA combinations that have no variance and drop them from the
  # model matrix by name.
  wc = melt(wdfc[,lapply(.SD, var), by=.(mo, hour), .SDcols = wdfc.PCAs], id.vars = c('mo','hour'))[value<1,][,value:=NULL]  # value<1 usually, but dropping dh_e where solar has no var.
  wc[, dropvars:=paste0('as.factor(hour)',hour,':as.factor(mo)',mo,':',variable)]
  
  # Now, make the model matrix #
  PCAs.intx = paste0('as.factor(hour):as.factor(mo):',wdfc.PCAs)  # these are the coef. FE's
  reg.form = as.formula(paste(' ~',paste(PCAs.intx, collapse=' + ')))
  wdfc.mm = model.matrix(reg.form, data=wdfc)
  wdfc.mm = wdfc.mm[,setdiff(colnames(wdfc.mm), wc[,dropvars])]  # only keep the columns (intx vars) not in dropvars (those that have zero within-variance) )
  wdfc.mm = wdfc.mm[,apply(wdfc.mm, MAR=2, var)!=0]  #--> within some cluster-hour-month groupings, there is no variance
  #  dim(wdfc.mm)                                              ###---> hmm, is this an issue for demeanlist? dropping these? No...should be OK. just dropping cols, not row.
  
  if(dim(wdfc.mm)[[1]]<=dim(wdfc.mm)[[2]]) {
    print('Not enough obs'); stop}

  #  if('gpuR' %in% .packages(T)){
  if(exists('gpuMatrix')){
    gpu.data.type = 'float'
    if(dim(wdfc.mm)[[2]]<10000) gpu.data.type = 'double'
    cat(paste0('i=',' data type: ', gpu.data.type, '\n'))
    wdfc.mm.g = gpuMatrix(wdfc.mm, type=gpu.data.type)
    cat('gpuR used\n')
  } else {
    cat(paste0('i=', ' non-gpu data type\n'))
    wdfc.mm.g = as.matrix(wdfc.mm)
    cat('gpuR NOT used\n')
  }
  
  Xs = list(qr.SOL = qr(wdfc.mm.g))
  rm(wdfc.mm.g);rm(wdfc);rm(wdfc.mm)
  gc()
} # end ALL are T
  
cat('Finished QR on ');cat(df.iter$ymn);cat('\n')


coef.results = list()

for(a in 1:length(df.iter$df)){
  cat('ORISPL_');cat(df.iter$df[[a]]$ORISPL[1]);cat(' begins...\n')
    why = df.iter$df[[a]]
      Ys.vars = c('SO2_MASS','NOX_MASS','CO2_MASS','GLOAD_MASS')
      why[,(Ys.vars):=lapply(.SD, function(y){ y - mean(y, na.rm=T)}), by = .(ym), .SDcols = Ys.vars]

    for(ss in 1:length(Xs)){ # 1 if SOL only; 2 if SOL and no.SOL
      isSOL = names(Xs)[[ss]]
      cat(isSOL); cat('\n')
      cf = as.data.table(qr.coef(Xs[[ss]], as.matrix(why[,Ys.vars, with=F])),
                         keep.rownames = 'coefnames')

      cf[,c('hour','mo','PCA'):=tstrsplit(coefnames, ':')]
      cf[,hour:=tstrsplit(hour, ')', keep=2, type.convert=T)]
      cf[,  mo:=tstrsplit(mo  , ')', keep=2, type.convert=T)]
      cf[, c('ORISPL','coefnames','isSOL','ymn', 'intx'):=list(df.iter$df[[a]]$ORISPL[1],NULL, isSOL, df.iter$ymn, jd$intx[1])]
      cf = cf[order(mo, hour, PCA), .(ymn, intx, ORISPL, PCA, mo, hour, isSOL, SO2_MASS, NOX_MASS, CO2_MASS, GLOAD_MASS)]
      cf[,NumHoursUsed:=NROW(why)]
    coef.results[[length(coef.results)+1]] = cf
    } #

if(a%%3==0) gc()
} # end a-loop over ORISPL
rm(Xs); gc()
return(rbindlist(coef.results, fill=T)) 
} # end dopar




saveRDS(rbindlist(RES, fill=T), file.path(WORK.OUT, 'LaterYearsCoefs.rds')) 

